{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from sklearn.model_selection import KFold\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_string(string):\n",
    "    string = re.sub('[^A-Za-z0-9\\-\\/ ]+', ' ', string).split()\n",
    "    return [y.strip() for y in string]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_raw(filename):\n",
    "    with open(filename, 'r') as fopen:\n",
    "        entities = fopen.read()\n",
    "    soup = BeautifulSoup(entities, 'html.parser')\n",
    "    inside_tag = ''\n",
    "    texts, labels = [], []\n",
    "    for sentence in soup.prettify().split('\\n'):\n",
    "        if len(inside_tag):\n",
    "            splitted = process_string(sentence)\n",
    "            texts += splitted\n",
    "            labels += [inside_tag] * len(splitted)\n",
    "            inside_tag = ''\n",
    "        else:\n",
    "            if not sentence.find('</'):\n",
    "                pass\n",
    "            elif not sentence.find('<'):\n",
    "                inside_tag = sentence.split('>')[0][1:]\n",
    "            else:\n",
    "                splitted = process_string(sentence)\n",
    "                texts += splitted\n",
    "                labels += ['OTHER'] * len(splitted)\n",
    "    assert (len(texts)==len(labels)), \"length texts and labels are not same\"\n",
    "    print('len texts and labels: ', len(texts))\n",
    "    return texts,labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len texts and labels:  34012\n",
      "len texts and labels:  9249\n"
     ]
    }
   ],
   "source": [
    "train_texts, train_labels = parse_raw('data_train.txt')\n",
    "test_texts, test_labels = parse_raw('data_test.txt')\n",
    "train_texts += test_texts\n",
    "train_labels += test_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('entities-bm-normalize-v3.txt','r') as fopen:\n",
    "    entities_bm = fopen.read().split('\\n')[:-1]\n",
    "entities_bm = [i.split() for i in entities_bm]\n",
    "entities_bm = [[i[0],'TIME' if i[0] in 'jam' else i[1]] for i in entities_bm]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'KN'\n",
      "'KA'\n"
     ]
    }
   ],
   "source": [
    "replace_by = {'LOC':'location','PRN':'person','NORP':'organization','ORG':'organization','LAW':'law',\n",
    "             'EVENT':'OTHER','FAC':'organization','TIME':'time','O':'OTHER','ART':'person','DOC':'law'}\n",
    "for i in entities_bm:\n",
    "    try:\n",
    "        string = process_string(i[0])\n",
    "        if len(string):\n",
    "            train_labels.append(replace_by[i[1]])\n",
    "            train_texts.append(process_string(i[0])[0])  \n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        \n",
    "assert (len(train_texts)==len(train_labels)), \"length texts and labels are not same\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "word2idx = {'PAD': 0,'NUM':1,'UNK':2}\n",
    "tag2idx = {'PAD': 0}\n",
    "char2idx = {'PAD': 0}\n",
    "word_idx = 3\n",
    "tag_idx = 1\n",
    "char_idx = 1\n",
    "\n",
    "def parse_XY(texts, labels):\n",
    "    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx\n",
    "    X, Y = [], []\n",
    "    for no, text in enumerate(texts):\n",
    "        text = text.lower()\n",
    "        tag = labels[no]\n",
    "        for c in text:\n",
    "            if c not in char2idx:\n",
    "                char2idx[c] = char_idx\n",
    "                char_idx += 1\n",
    "        if tag not in tag2idx:\n",
    "            tag2idx[tag] = tag_idx\n",
    "            tag_idx += 1\n",
    "        Y.append(tag2idx[tag])\n",
    "        if text not in word2idx:\n",
    "            word2idx[text] = word_idx\n",
    "            word_idx += 1\n",
    "        X.append(text)\n",
    "    return X, np.array(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, Y = parse_XY(train_texts, train_labels)\n",
    "idx2char = {idx: tag for tag, idx in char2idx.items()}\n",
    "idx2tag = {i: w for w, i in tag2idx.items()}\n",
    "onehot = np.zeros((Y.shape[0],len(tag2idx)))\n",
    "onehot[np.arange(Y.shape[0]),Y] = 1.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def str_idx(corpus, dic, UNK=0):\n",
    "    maxlen = max([len(i) for i in corpus])\n",
    "    X = np.zeros((len(corpus),maxlen))\n",
    "    for i in range(len(corpus)):\n",
    "        for no, k in enumerate(corpus[i][:maxlen][::-1]):\n",
    "            try:\n",
    "                X[i,-1 - no]=dic[k]\n",
    "            except Exception as e:\n",
    "                X[i,-1 - no]=UNK\n",
    "    return X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_seq = str_idx(X,char2idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "with open('char-bidirectional.json','w') as fopen:\n",
    "    fopen.write(json.dumps({'idx2tag':idx2tag, 'char2idx':char2idx,'tag2idx':tag2idx}))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.cross_validation import train_test_split\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(X_seq, onehot, test_size=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.2.2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import keras\n",
    "print(keras.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras.models import Model, Input\n",
    "from keras.layers import LSTM, Embedding, Dense, Dropout, Bidirectional\n",
    "from keras.backend.tensorflow_backend import set_session\n",
    "set_session(tf.InteractiveSession())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_word = Input(shape=(None,))\n",
    "model = Embedding(input_dim=len(char2idx) + 1, output_dim=128, mask_zero=True)(input_word)\n",
    "model = Bidirectional(LSTM(units=128, return_sequences=False, recurrent_dropout=0.1))(model)\n",
    "out = Dense(len(tag2idx),activation='softmax')(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Model(input_word, out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(optimizer=\"adam\", loss='categorical_crossentropy', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_1 (InputLayer)         (None, None)              0         \n",
      "_________________________________________________________________\n",
      "embedding_1 (Embedding)      (None, None, 128)         5120      \n",
      "_________________________________________________________________\n",
      "bidirectional_1 (Bidirection (None, 256)               263168    \n",
      "_________________________________________________________________\n",
      "dense_1 (Dense)              (None, 8)                 2056      \n",
      "=================================================================\n",
      "Total params: 270,344\n",
      "Trainable params: 270,344\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<svg height=\"264pt\" viewBox=\"0.00 0.00 277.00 264.00\" width=\"277pt\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g class=\"graph\" id=\"graph0\" transform=\"scale(1 1) rotate(0) translate(4 260)\">\n",
       "<title>G</title>\n",
       "<polygon fill=\"white\" points=\"-4,4 -4,-260 273,-260 273,4 -4,4\" stroke=\"none\"/>\n",
       "<!-- 139983722659912 -->\n",
       "<g class=\"node\" id=\"node1\"><title>139983722659912</title>\n",
       "<polygon fill=\"none\" points=\"72,-219.5 72,-255.5 197,-255.5 197,-219.5 72,-219.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"134.5\" y=\"-233.8\">input_1: InputLayer</text>\n",
       "</g>\n",
       "<!-- 139983722577872 -->\n",
       "<g class=\"node\" id=\"node2\"><title>139983722577872</title>\n",
       "<polygon fill=\"none\" points=\"54,-146.5 54,-182.5 215,-182.5 215,-146.5 54,-146.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"134.5\" y=\"-160.8\">embedding_1: Embedding</text>\n",
       "</g>\n",
       "<!-- 139983722659912&#45;&gt;139983722577872 -->\n",
       "<g class=\"edge\" id=\"edge1\"><title>139983722659912-&gt;139983722577872</title>\n",
       "<path d=\"M134.5,-219.313C134.5,-211.289 134.5,-201.547 134.5,-192.569\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"138,-192.529 134.5,-182.529 131,-192.529 138,-192.529\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 139983234241312 -->\n",
       "<g class=\"node\" id=\"node3\"><title>139983234241312</title>\n",
       "<polygon fill=\"none\" points=\"0,-73.5 0,-109.5 269,-109.5 269,-73.5 0,-73.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"134.5\" y=\"-87.8\">bidirectional_1(lstm_1): Bidirectional(LSTM)</text>\n",
       "</g>\n",
       "<!-- 139983722577872&#45;&gt;139983234241312 -->\n",
       "<g class=\"edge\" id=\"edge2\"><title>139983722577872-&gt;139983234241312</title>\n",
       "<path d=\"M134.5,-146.313C134.5,-138.289 134.5,-128.547 134.5,-119.569\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"138,-119.529 134.5,-109.529 131,-119.529 138,-119.529\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 139985066388112 -->\n",
       "<g class=\"node\" id=\"node4\"><title>139985066388112</title>\n",
       "<polygon fill=\"none\" points=\"83.5,-0.5 83.5,-36.5 185.5,-36.5 185.5,-0.5 83.5,-0.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"134.5\" y=\"-14.8\">dense_1: Dense</text>\n",
       "</g>\n",
       "<!-- 139983234241312&#45;&gt;139985066388112 -->\n",
       "<g class=\"edge\" id=\"edge3\"><title>139983234241312-&gt;139985066388112</title>\n",
       "<path d=\"M134.5,-73.3129C134.5,-65.2895 134.5,-55.5475 134.5,-46.5691\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"138,-46.5288 134.5,-36.5288 131,-46.5289 138,-46.5288\" stroke=\"black\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>"
      ],
      "text/plain": [
       "<IPython.core.display.SVG object>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.display import SVG\n",
    "from keras.utils.vis_utils import model_to_dot\n",
    "\n",
    "SVG(model_to_dot(model).create(prog='dot', format='svg'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 47341 samples, validate on 5261 samples\n",
      "Epoch 1/2\n",
      "47341/47341 [==============================] - 87s 2ms/step - loss: 0.6457 - acc: 0.8270 - val_loss: 0.5763 - val_acc: 0.8362\n",
      "Epoch 2/2\n",
      "47341/47341 [==============================] - 86s 2ms/step - loss: 0.5193 - acc: 0.8491 - val_loss: 0.4911 - val_acc: 0.8515\n"
     ]
    }
   ],
   "source": [
    "history = model.fit(train_X, train_Y, batch_size=32, epochs=2,\n",
    "                    validation_split=0.1, verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "predicted = model.predict(test_X)\n",
    "labels = [i[0] for i in sorted(tag2idx.items(), key=lambda t: t[1])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "       OTHER       0.87      0.98      0.92      4729\n",
      "organization       0.48      0.13      0.20       237\n",
      "      person       0.71      0.44      0.54       386\n",
      "        time       0.76      0.44      0.56       145\n",
      "    location       0.64      0.18      0.28       200\n",
      "    quantity       0.63      0.57      0.60       135\n",
      "         law       0.00      0.00      0.00        13\n",
      "\n",
      " avg / total       0.83      0.85      0.83      5845\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "print(classification_report(np.argmax(test_Y,1), np.argmax(predicted,1), target_names=labels[1:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_weights('char-bidirectional.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
